Supplementary Data File 3 -
This is the THIRD script used to generate the main figures for the the manuscript by Proctor et al. titled “A spatial gradient of bacterial diversity in the human oral cavity shaped by salivary flow”
Given attribution, you are free to: 1) Share, copy and redistribute the material in any medium or format 2) Adapt, remix, transform, and build upon the material for any purpose, even commercially.
To see the full license associated with attribution of this work, see the CC-By-CA license, see http://creativecommons.org/licenses/by-sa/4.0/.
Run the decontamination script (Supplementary Data File 1) first. Then run this one. Most of the supplemental figures are in an accompanying files (Supplementary Data Files 3-4). The local name of this file on my desktop is: Proctor_MainFigures_v28.0.Rmd.
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
##
## distance
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:phyloseq':
##
## sampleNames
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following objects are masked from 'package:genefilter':
##
## rowSds, rowVars
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
##
## apply
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 480 taxa and 7002 samples ]
## sample_data() Sample Data: [ 7002 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 480 taxa by 7 taxonomic ranks ]
load("~/Desktop/Proctor_NatureComm/mouth.RData")
discovery = mouthPhy
#how many samples and taxa
discovery
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 724 taxa and 1905 samples ]
## sample_data() Sample Data: [ 1905 samples by 27 sample variables ]
## tax_table() Taxonomy Table: [ 724 taxa by 6 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 724 tips and 723 internal nodes ]
#what is the sequencing depth?
summary(sample_sums(discovery))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1539 2258 2259 2931 11181
#how many subjects are in this dataset?
length(levels(as.factor(sample_data(discovery)$Subject)))
## [1] 11
#how many samples are in the validation dataset?
nsamples(supra)
## [1] 7002
#how many subjects are in the validation dataset?
length(levels(sample_data(supra)$Subject))
## [1] 19
#how many healthy aim 1 subjects are in the validation dataset?
aim1 = subset_samples(supra, Aim=="SA1" | Aim=="Pilot")
length(levels(sample_data(aim1)$Subject))
## [1] 9
#how many healthy aim 3 subjects are in the validation dataset?
aim3 = subset_samples(supra, Aim=="SA3")
length(levels(sample_data(aim3)$Subject))
## [1] 10
#what is the median sequencing depth of the validation dataset?
summary(sample_sums(supra))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 61920 73772 70563 84410 213919
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.4-4
load("~/Desktop/Proctor_NatureComm/Biogeo1_phys.Rdata")
#fix the sample names in the Biogeo object
names <- sample_names(Biogeo1_phys)
names <- str_replace_all(names, ("aaa"), ".")
names <- str_replace_all(names, ("zzz"), ".")
sample_names(Biogeo1_phys) <- names
#use version 5 of the mapping file, includes coordinates
map <- import_qiime(mapfilename="~/Desktop/Proctor_NatureComm/biogeo_mapping_file_corrected_v5.0.txt")
## Processing map file...
sample_data(Biogeo1_phys) <- map
sites = c("AG", "BM", "KG", "Supra")
#subset on the sites analyzed in this study
Biogeo2_phys <- subset_samples(Biogeo1_phys, Site.New %in% sites)
sample_data(Biogeo2_phys)$TissueClass = ifelse(sample_data(Biogeo2_phys)$Site.New =="Supra", "Non-shedding ", "Shedding ")
#note that alveolar mucosa and keratinized gingiva sites are mislabeled; switch them
### Fix the other sample site labels
sample_data(Biogeo2_phys)$Site.New = str_replace_all(sample_data(Biogeo2_phys)$Site.New, "(AG)", "Keratinized Gingiva ")
sample_data(Biogeo2_phys)$Site.New = str_replace_all(sample_data(Biogeo2_phys)$Site.New, "(BM)", "Buccal Mucosa ")
sample_data(Biogeo2_phys)$Site.New = str_replace_all(sample_data(Biogeo2_phys)$Site.New, "(KG)", "Alveolar Mucosa ")
sample_data(Biogeo2_phys)$Site.New = str_replace_all(sample_data(Biogeo2_phys)$Site.New, "(Supra)", "Supragingival Tooth ")
#how many samples are in the mucosal biogeo dataset?
nsamples(Biogeo2_phys)
## [1] 542
#library(dada2)
#sequence table from rdp classifier on rdp database
#tax = tax_table(Biogeo1_phys)
#set.seed(100) # Initialize random number generator for reproducibility
#taxa <- assignTaxonomy(rownames(tax), "~/Dropbox/rdp_train_set_14.fa.gz", minBoot=80)
#taxaSpecies <- addSpecies(taxa, "~/Dropbox/rdp_species_assignment_14.fa.gz", verbose=TRUE)
#tax_table(Biogeo1_phys) <- taxaSpecies
#blast some of the sequences and create a new variable column for the highest taxonomic rank
#write.csv(data.frame(tax_table(Biogeo1_phys), taxa_sums(Biogeo1_phys)), file="biogeo_species_v1.0.csv")
#read in the new taxonomic table
tax2 = read.csv("~/Desktop/Proctor_NatureComm/biogeo_species_v3.0.csv", header=TRUE, row.names=1)
tax3 = tax_table(tax2)
## Warning in .local(object): Coercing from data.frame class to character matrix
## prior to building taxonomyTable.
## This could introduce artifacts.
## Check your taxonomyTable, or coerce to matrix manually.
taxa_names(tax3) = rownames(tax2)
colnames(tax3) = colnames(tax2)
tax_table(Biogeo2_phys) = tax3
#relabel with the highest rank designation
taxa_names(Biogeo2_phys) = tax_table(Biogeo2_phys)[,8]
#filter to retain taxa present in 15% of samples
Biogeo2_phys15 = filter_taxa(Biogeo2_phys, function(x) sum(x > 10) > (0.15 * length(x)), TRUE)
Biogeo2_phys15 = prune_taxa(taxa_sums(Biogeo2_phys15) > 0, Biogeo2_phys15)
Biogeo2_phys15 = prune_samples(sample_sums(Biogeo2_phys15) > 0, Biogeo2_phys15)
Biogeo2_phys15
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 101 taxa and 542 samples ]
## sample_data() Sample Data: [ 542 samples by 27 sample variables ]
## tax_table() Taxonomy Table: [ 101 taxa by 12 taxonomic ranks ]
### Set up several otu tables with different transformations
#raw data
#vst-transformed data
### Stabilize the variance
#create a deseq object
Biogeo2_VST <- Biogeo2_phys15
otu_table(Biogeo2_VST) <- otu_table(Biogeo2_VST) + 1
Biogeo2.ds = phyloseq_to_deseq2(Biogeo2_VST, ~Subject + Site.New)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#variance stabilizing transformation
Biogeo2_phys15_VST <- varianceStabilizingTransformation(Biogeo2.ds , blind=TRUE, fitType="local")
biogeo_counts_VST <- otu_table(as.matrix(assay(Biogeo2_phys15_VST)), taxa_are_rows=TRUE)
otu_table(Biogeo2_VST) <- biogeo_counts_VST
otus.VST = data.frame(t(otu_table(Biogeo2_VST)))
#how many samples are in the mucosal biogeo dataset?
nsamples(Biogeo2_phys)
## [1] 542
#how many supra samples and mucosal samples are there
sb = data.frame(sample_data(Biogeo2_phys))
table(sb$Site.New)
##
## Alveolar Mucosa Buccal Mucosa Keratinized Gingiva
## 126 82 166
## Supragingival Tooth
## 168
#how many mucosal samples are there
sb = subset(sb, Site.New != "Supragingival Tooth ")
sum(table(sb$Site.New))
## [1] 374
#how many subjects / samples per subject are in the mucosal biogeo dataset?
table(sb$Subject)
##
## 4-101 4-102 4-103
## 124 124 126
#what is the median sequencing depth of the mucosal biogeo dataset?
summary(sample_sums(Biogeo2_phys))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 70991 82354 77684 91721 123925
#how many taxa are in the decontaminated validation data
supra
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 480 taxa and 7002 samples ]
## sample_data() Sample Data: [ 7002 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 480 taxa by 10 taxonomic ranks ]
nsamples(discovery)+nsamples(Biogeo2_phys)+nsamples(supra)
## [1] 9449
#What is the relative abundance of each Phylum?
library(doBy)
supra.phy <- tax_glom(supra, taxrank="Phylum")
tax.count <- data.frame(tax_table(supra.phy)[,2:9], taxa_sums(supra.phy))
colnames(tax.count) <- c("Kingdom", "Phylum","Class","Order","Family","Genus", "Species", "Highest", "Abundance")
Phylum_df <- summaryBy(Abundance~Phylum, data=tax.count, FUN=sum)
Phylum_df $Percent <- round(Phylum_df $Abundance.sum/sum(Phylum_df $Abundance)*100, 4)
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:IRanges':
##
## desc
## The following object is masked from 'package:S4Vectors':
##
## rename
Phylum_df <- arrange(Phylum_df, desc(Percent))
Phylum_df
## Phylum Abundance.sum Percent
## 1 Firmicutes 196601153 39.8095
## 2 Proteobacteria 139356937 28.2182
## 3 Actinobacteria 108027071 21.8742
## 4 Bacteroidetes 34477486 6.9813
## 5 Fusobacteria 14858582 3.0087
## 6 SR1 340398 0.0689
## 7 Spirochaetes 121010 0.0245
## 8 Cyanobacteria/Chloroplast 52172 0.0106
## 9 Candidatus_Saccharibacteria 10815 0.0022
## 10 Aquificae 3559 0.0007
## 11 Acidobacteria 2799 0.0006
## 12 Deinococcus-Thermus 1701 0.0003
## 13 Tenericutes 1296 0.0003
write.csv(Phylum_df, file="~/Desktop/TableS1.csv")
#how much do the top 5 phyla contribute to total abundance?
top5 <- Phylum_df[1:5, ]
round(sum(top5$Percent),2)
## [1] 99.89
#What are the top 5 Phyla?
top5
## Phylum Abundance.sum Percent
## 1 Firmicutes 196601153 39.8095
## 2 Proteobacteria 139356937 28.2182
## 3 Actinobacteria 108027071 21.8742
## 4 Bacteroidetes 34477486 6.9813
## 5 Fusobacteria 14858582 3.0087
#What is the number of Genera found?
length(get_taxa_unique(supra, taxonomic.rank="Genus"))
## [1] 118
#what are the abundance levels of each genus?
supra.genus <- tax_glom(supra, taxrank="Genus")
tax.count <- data.frame(tax_table(supra.genus)[,2:9], taxa_sums(supra.genus))
colnames(tax.count) <- c("Kingdom", "Phylum","Class","Order","Family","Genus", "Species", "Highest", "Abundance")
Genus_df<- summaryBy(Abundance~Genus, data=tax.count, FUN=sum)
Genus_df$Percent <- round(Genus_df$Abundance.sum/sum(Genus_df$Abundance)*100, 4)
library(plyr)
Genus_df <- arrange(Genus_df, desc(Percent))
Genus_df <- Genus_df[order(Genus_df$Percent, decreasing=TRUE), ]
#how much do the top 10 genera contribute to total abundance?
top10 <- Genus_df[1:10, ]
round(sum(top10$Percent),3)
## [1] 88.198
#what about the top 5
top5 <- Genus_df[1:5, ]
round(sum(top5$Percent),3)
## [1] 72.43
###How diverse are the top 10 genera? i.e., how many species are there per genus?
top10 <- as.vector(Genus_df$Genus[1:10])
Diversity.list <- vector("list", 10)
names(Diversity.list) <- top10
for(i in 1:length(top10)){
physub = subset_taxa(supra, Genus==top10[i])
physub = prune_taxa(taxa_sums(physub) > 0, physub)
Diversity.list[[i]] <- physub
}
#compute the number of taxa in each element of the list
Ntaxa <- data.frame(unlist(lapply(Diversity.list, ntaxa)))
colnames(Ntaxa) <- "N.Species"
#Make a table with percent abundance and number of taxa
genus.tab <- data.frame(Genus_df[1:10,], Ntaxa)
library(knitr)
kable(genus.tab, format="markdown")
| Genus | Abundance.sum | Percent | N.Species |
|---|---|---|---|
| Streptococcus | 143337443 | 31.6955 | 11 |
| Haemophilus | 64073059 | 14.1682 | 6 |
| Rothia | 61438005 | 13.5855 | 6 |
| Neisseria | 32952227 | 7.2866 | 10 |
| Actinomyces | 25749541 | 5.6939 | 17 |
| Corynebacterium | 19371708 | 4.2836 | 4 |
| Veillonella | 17758620 | 3.9269 | 10 |
| Abiotrophia | 11542724 | 2.5524 | 1 |
| Gemella | 11334234 | 2.5063 | 2 |
| Prevotella | 11301369 | 2.4990 | 46 |
write.csv(genus.tab, file="~/Desktop/TableS2.csv")
library(RColorBrewer)
### Create an index tooth biogeography subset
index <- subset_samples(supra, Protocol=="Home")
sites <- c("Molar", "Incisor_Central")
index_subsites <- subset_samples(index, Tooth_Class %in% sites)
index_subsites
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 480 taxa and 3400 samples ]
## sample_data() Sample Data: [ 3400 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 480 taxa by 10 taxonomic ranks ]
#how many samples are in average 2 subjects? Looks like about 25% of samples
mean(table(sample_data(index_subsites)$Subject))
## [1] 425
#filter the taxa to include those present in 25% of samples
filtergroup = filterfun(kOverA(k=850, A=1)) #k = number of samples; A = abundance
index25 = filter_taxa(index_subsites, filtergroup, prune=TRUE)
index25 = prune_samples(sample_sums(index25) > 0, index25)
#note the following command drops 7 samples (depth between 1-35)
index25 = prune_samples(sample_sums(index25) > 800, index25)
index25
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 90 taxa and 3393 samples ]
## sample_data() Sample Data: [ 3393 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 90 taxa by 10 taxonomic ranks ]
#fix the labeling of the subject ids
#relabel for prettier plotting
sample_data(index25)$Tooth_Class <- str_replace_all(sample_data(index25)$Tooth_Class, "(Incisor_Central)", "Incisor")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-101)", "Subject 1")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-102)", "Subject 2")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-104)", "Subject 3")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-105)", "Subject 4")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-106)", "Subject 5")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-107)", "Subject 6")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(P1-8)", "Subject 7")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(P1-9)", "Subject 8")
#force the labeling of the subjects
ordering= c("Subject 1", "Subject 2", "Subject 3" , "Subject 4" , "Subject 5" ,"Subject 6", "Subject 7" ,"Subject 8")
sample_data(index25)$Subject <- factor(sample_data(index25)$Subject, levels = ordering)
#hellinger transform the data
otus = data.frame(otu_table(index25))
library(vegan)
otus.h = decostand(otus, "hellinger")
#Do PCOA
#nmds on bray curtis distance of hellinger transformation
h.euc = vegdist(otus.h, "bray")
h.nmds = cmdscale(h.euc, eig=TRUE, k=10)
h.map = data.frame(sample_data(index25), scores(h.nmds))
h.map$method = "Hellinger"
h.eig = 100*(h.nmds$eig/sum(h.nmds$eig))
head(h.eig)
## [1] 23.478257 10.985589 9.710830 7.991654 6.795043 5.868218
#make a screeplot
evals <- h.nmds$eig
foo = data.frame(h.eig, as.numeric(1:length(h.eig)))
colnames(foo) = c("percent.variation", "rank")
p=ggplot(foo[1:10,], aes(rank, percent.variation)) + geom_point() + theme_bw()
p
#look at all the data as a function of tooth class
p1=plot_ordination(index25, h.nmds, type="samples", color="Tooth_Class") + geom_point(size=2, alpha=0.1) + theme_bw() +theme(legend.position = "none") + theme(text = element_text(size=18), axis.text.x = element_text(angle=0, vjust=1))+coord_fixed(sqrt(evals[2] / evals[1]))
p.dat = p1$data
##### plot figure panels
#1a
fig1a=plot_ordination(index25, h.nmds, type="samples", color="Subject") + geom_point(alpha=0.4) + theme_bw() + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) + scale_colour_brewer(palette="Dark2")+coord_fixed(sqrt(evals[2] / evals[1])) + ggtitle("a)") + xlab("Axis 1 (23.48%)") + ylab("Axis 2 (10.99%)")
#1b
fig1b=plot_ordination(index25, h.nmds, type="samples", color="Tooth_Class") + geom_point(alpha=0.4) + theme_bw() + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) +coord_fixed(sqrt(evals[2] / evals[1])) + guides(color=guide_legend(title="Tooth Class")) + ggtitle("b)") + xlab("Axis 1 (23.48%)") + ylab("Axis 2 (10.99%)")
#1c
fig1c=plot_ordination(index25, h.nmds, type="samples", color="Tooth_Aspect") + geom_point(alpha=0.4) + theme_bw() + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) + scale_colour_brewer(palette="Set1") +coord_fixed(sqrt(evals[2] / evals[1]))+ guides(color=guide_legend(title="Tooth Aspect")) + ggtitle("c)") + xlab("Axis 1 (23.48%)") + ylab("Axis 2 (10.99%)")
#1d
fig1d <- ggplot(p.dat, aes(Tooth_Class, Dim1, fill=Tooth_Class)) + geom_boxplot(notch=TRUE) + facet_wrap(~Tooth_Aspect) + theme_bw() + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) + ylab("Axis 1 (23.48%)") + xlab("")+ guides(fill=guide_legend(title="Tooth Class")) + ggtitle("d)")
grid.arrange(fig1a, fig1b, fig1c, fig1d, ncol=2)
#save figure 1
ggsave(grid.arrange(fig1a, fig1b, fig1c, fig1d, ncol=2), file="~/Dropbox/Figure1.eps", width = 10, height = 8, units ="in",dpi = 300, device="eps")
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): semi-
## transparency is not supported on this device: reported only once per page
#how many comparisons are there in the boxplots
#buccal
b = subset(p.dat, Tooth_Aspect=="Buccal")
table(b$Tooth_Class)
##
## Incisor Molar
## 849 849
#lingual
l = subset(p.dat, Tooth_Aspect=="Lingual")
table(l$Tooth_Class)
##
## Incisor Molar
## 849 846
p1.dat = fig1d$data
#how many incisors have negative scores
test1 = subset(p1.dat, Tooth_Class=="Incisor" & Dim1 > 0)
testb = subset(p1.dat, Tooth_Class=="Incisor")
nrow(test1)/nrow(testb)
## [1] 0.6943463
#of incisors with + scores, many incisors have scores > than 0.25?
test2 = subset(p1.dat, Tooth_Class=="Incisor" & Dim1 > 0.25)
nrow(test2)/nrow(test1)
## [1] 0.259542
#how many molar communities have positive scores
test3 = subset(p1.dat, Tooth_Class=="Molar" & Dim1 > 0)
testb = subset(p1.dat, Tooth_Class=="Molar")
nrow(test3)/nrow(testb)
## [1] 0.2637168
#how many molars have scores > than 0.25
test4 = subset(p1.dat, Tooth_Class=="Molar" & Dim1 > 0.25)
testb = subset(p1.dat, Tooth_Class=="Molar" )
nrow(test4)/nrow(test3)
## [1] 0.006711409
## see how many lingual samples have scores less than 0.25 sicnce there seems to be an interaction with most lingual incisors falling below 0.25
test5 = subset(p1.dat, Tooth_Aspect=="Lingual" & Dim1 > 0)
testb = subset(p1.dat, Tooth_Aspect=="Lingual" )
nrow(test5)/nrow(testb)
## [1] 0.5451327
#merge the temporal replicates within each subject to avoid temporal pseudoreplication
subs <- levels(sample_data(index25)$Subject)
holder <-vector('list',length(subs))
for(i in 1:length(subs)){
subx = subset_samples(index25, Subject==subs[[i]])
subh = merge_samples(subx, "Habitat")
sample_data(subh)$Subject = subs[[i]]
sample_data(subh)$Habitat = sample_names(subh)
sample_names(subh) = paste0(sample_data(subh)$Subject, "_", sample_names(subh))
holder[[i]] = subh
}
is = merge_phyloseq(holder[[1]], holder[[2]], holder[[3]], holder[[4]], holder[[5]], holder[[6]], holder[[7]])
is2 = prune_taxa(taxa_sums(is) > 0, is)
is2 = prune_samples(sample_sums(is2) > 0, is2)
is2
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 90 taxa and 112 samples ]
## sample_data() Sample Data: [ 112 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 90 taxa by 10 taxonomic ranks ]
#set up a new mapping file to correct what was lost during the merge
foo = sample_names(is2)
foo2 <- str_replace_all(foo, "([B,L])", replacement =c(".Buccal", ".Lingual"))
foo2 <- str_replace_all(foo2, "_", ".")
map <- colsplit(foo2, "([.])", c("Subject", "Junk", "Tooth_Number", "Tooth_Aspect"))
map = data.frame(sample_data(is2)$Habitat, map)
colnames(map)[1] = "Habitat"
#merge in the mapping file information
dot = read.csv("~/Desktop/Proctor_NatureComm/mytoothdot_coordinates_FullMouth.csv")
colnames(dot)[1] = "Habitat"
#merge maps
library(plyr)
map2 = join(map, dot, by="Habitat")
#make the sample names of map2 the same as the names in bs2 then replace the mapping file
names = paste0(map2$Subject, "_", map2$Habitat)
rownames(map2) = names
sample_data(is2) = map2
#library(phyloseq)
library(vegan)
otus = data.frame(otu_table(is2))
otus.h = decostand(otus, "hellinger")
otus.euc = vegdist(otus.h, "euclidean")
attach(sample_data(is2))
set.seed(12)
index.adonis <- adonis(otus.euc ~Subject*Tooth_Class*Tooth_Aspect, permutations=1000, strata = Subject)
df <- format(data.frame(index.adonis$aov.tab), digits=2)
kable(df, format="markdown")
| Df | SumsOfSqs | MeanSqs | F.Model | R2 | Pr..F. | |
|---|---|---|---|---|---|---|
| Subject | 6 | 13.22 | 2.204 | 35.7 | 0.469 | 0.001 |
| Tooth_Class | 1 | 2.64 | 2.642 | 42.7 | 0.094 | 0.001 |
| Tooth_Aspect | 1 | 1.06 | 1.060 | 17.1 | 0.038 | 0.001 |
| Subject:Tooth_Class | 6 | 2.69 | 0.448 | 7.3 | 0.095 | 0.001 |
| Subject:Tooth_Aspect | 6 | 1.21 | 0.202 | 3.3 | 0.043 | 0.001 |
| Tooth_Class:Tooth_Aspect | 1 | 1.22 | 1.216 | 19.7 | 0.043 | 0.001 |
| Subject:Tooth_Class:Tooth_Aspect | 6 | 0.99 | 0.165 | 2.7 | 0.035 | 0.004 |
| Residuals | 84 | 5.19 | 0.062 | NA | 0.184 | NA |
| Total | 111 | 28.22 | NA | NA | 1.000 | NA |
#write.csv(df, file="~/Dropbox/Figures_20170617/Revised/FinalFigures/TableS3.csv")
detach(sample_data(is2))
#do we see a difference in sequencing depth by tooth location, no we don't for most subjects
df = data.frame(sample_sums(is2), sample_data(is2))
colnames(df)[1] = "abundance"
#does sequencing depth vary by tooth class on lingual surfaces?
ling= subset(df, Tooth_Aspect=="Lingual")
wilcox.test(abundance~Tooth_Class, data=ling, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test
##
## data: abundance by Tooth_Class
## W = 324, p-value = 0.271
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -219138 58331
## sample estimates:
## difference in location
## -73439
#whatabout buccal surfaces
buc= subset(df, Tooth_Aspect=="Buccal")
wilcox.test(abundance~Tooth_Class, data=buc, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test
##
## data: abundance by Tooth_Class
## W = 299, p-value = 0.1303
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -171951 26164
## sample estimates:
## difference in location
## -78530.5
mi= subset_samples(supra, Tooth_Class %in% c("Incisor_Central", "Molar"))
mi = prune_samples(sample_sums(mi) > 0, mi)
meths = c("Shannon", "Chao1", "Simpson")
sites = levels(as.factor(sample_data(mi)$Habitat))
holder <-vector('list',length(sites))
for(i in length(sites))
{
# Estimate diversity and make data frame
erDF <- estimate_richness(mi, split = TRUE, measures = meths)
df <- data.frame(erDF, sample_data(mi))
holder[[i]] <- df
}
df <- data.frame(do.call("rbind", holder))
#test to see whether alpha diversity varies based on tooth class
wilcox.test(Chao1~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Chao1 by Tooth_Class
## W = 1133700, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -24.00003 -21.99993
## sample estimates:
## difference in location
## -22.99995
wilcox.test(Shannon~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Shannon by Tooth_Class
## W = 1686900, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.2663786 -0.2123560
## sample estimates:
## difference in location
## -0.2393575
wilcox.test(Simpson~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Simpson by Tooth_Class
## W = 1985700, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.02320114 -0.01541780
## sample estimates:
## difference in location
## -0.01931219
mi= subset_samples(mi, Protocol=="Clinic")
mi = prune_samples(sample_sums(mi) > 0, mi)
mi
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 480 taxa and 984 samples ]
## sample_data() Sample Data: [ 984 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 480 taxa by 10 taxonomic ranks ]
meths = c("Shannon", "Chao1", "Simpson")
sites = levels(as.factor(sample_data(mi)$Habitat))
holder <-vector('list',length(sites))
for(i in length(sites))
{
# Estimate diversity and make data frame
erDF <- estimate_richness(mi, split = TRUE, measures = meths)
df <- data.frame(erDF, sample_data(mi))
holder[[i]] <- df
}
df <- data.frame(do.call("rbind", holder))
#test to see whether there are two class differences
#test to see whether alpha diversity varies based on tooth class
wilcox.test(Chao1~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Chao1 by Tooth_Class
## W = 58948, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -21.99995 -15.99996
## sample estimates:
## difference in location
## -18.99996
wilcox.test(Shannon~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Shannon by Tooth_Class
## W = 82205, p-value = 1.015e-09
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.2919400 -0.1525899
## sample estimates:
## difference in location
## -0.2219933
wilcox.test(Simpson~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Simpson by Tooth_Class
## W = 90641, p-value = 4.082e-05
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.03257463 -0.01132578
## sample estimates:
## difference in location
## -0.0217224
#### 1. Compute the third-order orthogonal polynomial terms
#create validation datset #2 for time x space analysis, all teeth, healthy subjects
keep <- c("1-101", "1-102", "1-103", "1-104", "1-105", "1-106", "1-107")
FullMouth <- subset_samples(supra, Subject %in% keep & Protocol=="Clinic")
FullMouth <- prune_taxa(taxa_sums(FullMouth) > 0, FullMouth)
FullMouth
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 469 taxa and 1701 samples ]
## sample_data() Sample Data: [ 1701 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 469 taxa by 10 taxonomic ranks ]
#filter the data
filtergroup = filterfun(kOverA(k=200, A=1)) #k = number of samples; A = abundance
f1 = filter_taxa(FullMouth, filtergroup, prune=TRUE)
f1 = prune_taxa(taxa_sums(f1) > 0, f1)
f1 = prune_samples(sample_sums(f1) > 0, f1)
f1
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 117 taxa and 1701 samples ]
## sample_data() Sample Data: [ 1701 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 117 taxa by 10 taxonomic ranks ]
#fix the variable labeling for prettier plotting
sample_data(f1)$Tooth_Class <- str_replace_all(sample_data(f1)$Tooth_Class, "(Incisor_Central)", "Incisor")
sample_data(f1)$Tooth_Class <- str_replace_all(sample_data(f1)$Tooth_Class, "(Incisor_Lateral)", "Incisor")
sample_data(f1)$Tooth_Class <- str_replace_all(sample_data(f1)$Tooth_Class, "(Molar_Pre)", "Premolar")
#hellinger transform the data
library(vegan)
otus = data.frame(otu_table(f1))
otus.h = decostand(otus, "hellinger")
map = sample_data(f1)
map$x = as.numeric(as.character(map$x))
map$y = as.numeric(as.character(map$y))
#center the coordinates
xygrid = cbind(map$x, map$y)
xygrid.c <- scale(xygrid, scale=FALSE)
### Compute the third-order orthogonal polynomial function using centered geographic coordinates
poly.xy3 <- poly(xygrid.c, degree = 3, raw=FALSE) #, coefs=TRUE)
colnames(poly.xy3) <- c("X", "X^2", "X^3", "Y", "XY", "X^2Y", "Y^2", "XY^2", "Y^3")
poly.xy3.df <- data.frame(poly.xy3, map$x, map$y)
library(ade4)
##
## Attaching package: 'ade4'
## The following object is masked from 'package:GenomicRanges':
##
## score
## The following object is masked from 'package:IRanges':
##
## score
## The following object is masked from 'package:BiocGenerics':
##
## score
#perform the RDA on the hellinger transformed data; extract the coordinates
rld.pca <- dudi.pca(otus.h , center=TRUE, scale=TRUE, scannf=FALSE, nf=10)
rld.xy3 <- pcaiv(rld.pca, poly.xy3, scannf = FALSE, nf = 6)
rld.xy3.df <- data.frame(rld.xy3$ls, map)
xy3.df <- data.frame(rld.xy3.df, poly.xy3)
# how much of the variance does the trend surface model explain?
rld.xy3.var <- sum(rld.xy3$eig)/sum(rld.pca$eig)*100
rld.xy3.var
## [1] 3.449022
#look at the eigenvalues
rld.xy3$eig
## [1] 1.89291876 1.01339151 0.59150811 0.18315651 0.11501567 0.08787059
## [7] 0.05979274 0.04891794 0.04278420
#look at the screeplot
screeplot(rld.xy3) #we should look at the first 3 axes
#what is the percent of ewxplained variance
Explainedvariance = rld.xy3$eig/sum(rld.xy3$eig)*100
Explainedvariance
## [1] 46.908346 25.112815 14.658139 4.538794 2.850199 2.177518 1.481722
## [8] 1.212234 1.060234
#force x, y to numeric
xy3.df$x <- as.numeric(as.character(xy3.df$x))
xy3.df$y <- as.numeric(as.character(xy3.df$y))
### Force variables to order in ggplot
order=1:32
xy3.df$Tooth_Number <- factor(xy3.df$Tooth_Number, as.character(order))
xy3.df$tclass = xy3.df$Tooth_Class
xy3.df$tclass = str_replace_all(xy3.df$tclass, "Incisor_Central", "Incisor")
xy3.df$tclass = str_replace_all(xy3.df$tclass, "Incisor_Lateral", "Incisor")
#plot axis 1
cbPalette <- c("grey76", "Salmon", "#00BFC4", "grey76")
a1a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis1, fill=tclass)) + theme_bw() + ylab("Axis 1") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth") + ylim(-8, 8) + geom_boxplot()+theme(legend.position="none")+ scale_fill_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
#plot axis 2
a2a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis2, fill=tclass)) + theme_bw() + ylab("Axis 2") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth") + ylim(-8, 8) +theme(legend.position="none") + geom_boxplot()+ scale_fill_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
#plot axis 3
a3a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis3, fill=tclass)) + theme_bw() + ylab("Axis 3") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth") + ylim(-8, 8) +theme(legend.position="none") + geom_boxplot()+ scale_fill_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
#plot axis 4
a4a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis4, fill=tclass)) + theme_bw() + ylab("Axis 4") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth") + ylim(-8, 8) +theme(legend.position="none") + geom_boxplot()+ scale_fill_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
grid.arrange(a1a, a2a, a3a, a4a, ncol=2)
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
## Warning: Removed 26 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
#how many comparisons are there in the boxplots
#buccal
b = subset(xy3.df, Tooth_Aspect=="Buccal")
table(b$Tooth_Class)
##
## Canine Incisor Molar Premolar
## 126 249 249 227
#lingual
l = subset(xy3.df, Tooth_Aspect=="Lingual")
table(l$Tooth_Class)
##
## Canine Incisor Molar Premolar
## 125 250 248 227
a1a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis1, fill=tclass)) + theme_bw() + ylab("Axis 1") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth") + ylim(-8, 8) + theme(plot.title = element_text(size=12)) + geom_boxplot()+theme(legend.position="none")+scale_fill_manual(values=cbPalette)
a1a
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
c1 = ggplot(xy3.df, aes(Axis2, Axis1, colour=Tooth_Class)) + theme_bw() + geom_point() + guides(color=guide_legend(title="Tooth Class")) + theme(legend.position="none")+ scale_color_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
c1
ggsave(a1a, file="~/Dropbox/Figure2a.eps", width = 3.5, height = 6, units ="in",dpi = 300, device="eps")
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
#ggsave(a1a, file="~/Desktop/Figure2b.png", width = 3.5, height = 6, units ="in",dpi = 300, device="png")
#third order non-orthogonal polynomial
fullmouth.trendnon.rda <- rda(otus.h ~., data=as.data.frame(poly.xy3))
(R2adj.poly <- RsquareAdj(fullmouth.trendnon.rda)$adj.r.squared)
## [1] 0.03295887
# do the forward selection
library(packfor)
## packfor: R Package for Forward Selection (Canoco Manual p.49)
## version0.0-8
fullmouth.trend.fwd <- forward.sel(otus.h, poly.xy3, adjR2thresh=R2adj.poly)
## Testing variable 1
## Testing variable 2
## Testing variable 3
## Testing variable 4
## Testing variable 5
## Testing variable 6
## Testing variable 7
## Procedure stopped (R2more criteria): variable 7 explains only 0.000938 of the variance.
kable(fullmouth.trend.fwd)
| variables | order | R2 | R2Cum | AdjR2Cum | F | pval |
|---|---|---|---|---|---|---|
| X^2 | 2 | 0.0116277 | 0.0116277 | 0.0110460 | 19.987961 | 0.001 |
| Y^2 | 7 | 0.0089643 | 0.0205920 | 0.0194384 | 15.541338 | 0.001 |
| Y | 4 | 0.0079510 | 0.0285430 | 0.0268256 | 13.889299 | 0.001 |
| Y^3 | 9 | 0.0032835 | 0.0318266 | 0.0295431 | 5.751951 | 0.001 |
| X^2Y | 6 | 0.0027969 | 0.0346235 | 0.0317757 | 4.910795 | 0.001 |
| X^3 | 3 | 0.0012532 | 0.0358767 | 0.0324618 | 2.201905 | 0.012 |
#write.csv(fullmouth.trend.fwd, file="~/Dropbox/Figures_20170617/Revised/FinalFigures/TableS5.csv")
#perform a new RDA using the significant terms
fullmouth.trend.rda2 <- rda(otus.h~., data=as.data.frame(poly.xy3)[,fullmouth.trend.fwd[,2]])
#what is the R2 of the new model
(R2adj.fwd <- RsquareAdj(fullmouth.trend.rda2)$adj.r.squared)
## [1] 0.03246182
set.seed(990)
#test the significance of the model
fm.aov = anova.cca(fullmouth.trend.rda2, step=1000)
fm.aov
## Permutation test for rda under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: rda(formula = otus.h ~ `X^2` + `Y^2` + Y + `Y^3` + `X^2Y` + `X^3`, data = as.data.frame(poly.xy3)[, fullmouth.trend.fwd[, 2]])
## Df Variance F Pr(>F)
## Model 6 0.01555 10.506 0.001 ***
## Residual 1694 0.41791
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#test the significance of the individual axes
set.seed(88)
fm.aov.axes = anova.cca(fullmouth.trend.rda2, step=1000, by="axis")
fm.aov.axes
## Permutation test for rda under reduced model
## Forward tests for axes
## Permutation: free
## Number of permutations: 999
##
## Model: rda(formula = otus.h ~ `X^2` + `Y^2` + Y + `Y^3` + `X^2Y` + `X^3`, data = as.data.frame(poly.xy3)[, fullmouth.trend.fwd[, 2]])
## Df Variance F Pr(>F)
## RDA1 1 0.00710 28.7741 0.001 ***
## RDA2 1 0.00465 18.8418 0.001 ***
## RDA3 1 0.00222 8.9942 0.001 ***
## RDA4 1 0.00091 3.6707 0.006 **
## RDA5 1 0.00042 1.6839 0.158
## RDA6 1 0.00026 1.0718 0.355
## Residual 1694 0.41791
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the 5 significant spatial structures (i.e., canonical axes)
#note that scaling to 1 displays the spatial model (i.e., the linear combo of spatial variables while preserving the Euclidean distance among sites)
fullmouth.trend.fit <- scores(fullmouth.trend.rda2, choices=c(1, 2, 3, 4), display="lc", scaling=1)
fit = data.frame(fullmouth.trend.fit, sample_data(f1))
fit$x = as.numeric(as.character(fit$x))
fit$y = as.numeric(as.character(fit$y))
library(RColorBrewer)
myPalette = colorRampPalette(brewer.pal(11, "RdBu"), space="Lab")
#plot the first axis
p1 = ggplot(fit, aes(x, y, color=RDA1)) +ggtitle("RDA1") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))
#plot the second axis
p2 = ggplot(fit, aes(x, y, color=RDA2)) +ggtitle("RDA2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ theme(plot.title = element_text(size=12))
#plot the third axis
p3 = ggplot(fit, aes(x, y, color=RDA3)) +ggtitle("RDA3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ theme(plot.title = element_text(size=12))
#plot the 4th axis
p4 = ggplot(fit, aes(x, y, color=RDA4)) +ggtitle("RDA4") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ theme(plot.title = element_text(size=12))
#visualize the significant axes fit
grid.arrange(p1, p2, p3,p4, ncol=4)
#ggsave(grid.arrange(p1, p2, p3,p4,ncol=2), file="~/Dropbox/Figures_20170617/Revised/FinalFigures/FigureS7a.png", width = 10, height = 9, units ="in",dpi = 300, device="png")
sig = fullmouth.trend.fwd$variables
sig = str_replace_all(sig, "[(^)]", ".")
xy.sig = data.frame(poly.xy3)[sig]
#subset on the significant polynomial terms
df = as.data.frame(poly.xy3)[,fullmouth.trend.fwd[,2]]
df = data.frame(df, sample_data(f1))
df$x = as.numeric(as.character(df$x))
df$y = as.numeric(as.character(df$y))
fullmouth.trend.fwd$F.Stat = round(fullmouth.trend.fwd$F, 2)
#plot the first axis
t1 = ggplot(df, aes(x, y, color=X.2)) +ggtitle("X^2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))
t2 = ggplot(df, aes(x, y, color=Y.2)) +ggtitle("Y^2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))
t3 = ggplot(df, aes(x, y, color=Y)) +ggtitle("Y") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))
t4 = ggplot(df, aes(x, y, color=Y.3)) +ggtitle("Y^3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))
t5 = ggplot(df, aes(x, y, color=X.2Y)) +ggtitle("X^2*Y") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))
t6 = ggplot(df, aes(x, y, color=X.3)) +ggtitle("X^3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))
grid.arrange(t1, t2, t3,t4, t5, t6, ncol=3)
#ggsave(grid.arrange(t1, t2, t3,t4, t5, t6, ncol=2), file="~/Dropbox/Figures_20170617/Revised/FinalFigures/FigureS7b.png", width = 10, height = 9, units ="in",dpi = 300, device="png")
#merge the temporal replicates within each subject to avoid temporal pseudoreplication
subs <- levels(sample_data(f1)$Subject)
holder <-vector('list',length(subs))
for(i in 1:length(subs)){
subx = subset_samples(f1, Subject==subs[[i]])
subh = merge_samples(subx, "Habitat")
sample_data(subh)$Subject = subs[[i]]
sample_data(subh)$Habitat = sample_names(subh)
sample_names(subh) = paste0(sample_data(subh)$Subject, "_", sample_names(subh))
holder[[i]] = subh
}
is = merge_phyloseq(holder[[1]], holder[[2]], holder[[3]], holder[[4]], holder[[5]], holder[[6]], holder[[7]])
is2 = prune_taxa(taxa_sums(is) > 0, is)
is2 = prune_samples(sample_sums(is2) > 0, is2)
is2
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 117 taxa and 384 samples ]
## sample_data() Sample Data: [ 384 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 117 taxa by 10 taxonomic ranks ]
#set up a new mapping file to correct what was lost during the merge
foo = sample_names(is2)
foo2 <- str_replace_all(foo, "([B,L])", replacement =c(".Buccal", ".Lingual"))
foo2 <- str_replace_all(foo2, "_", ".")
map <- colsplit(foo2, "([.])", c("Subject", "Junk", "Tooth_Number", "Tooth_Aspect"))
map = data.frame(sample_data(is2)$Habitat, map)
colnames(map)[1] = "Habitat"
#merge in the mapping file information
dot = read.csv("~/Desktop/Proctor_NatureComm/mytoothdot_coordinates_FullMouth.csv")
colnames(dot)[1] = "Habitat"
#merge maps
library(plyr)
map2 = join(map, dot, by="Habitat")
#make the sample names of map2 the same as the names in bs2 then replace the mapping file
names = paste0(map2$Subject, "_", map2$Habitat)
rownames(map2) = names
sample_data(is2) = map2
#subset on the most abundant taxa for ease of visualization
library(ggcorrplot)
sub = names(sort(taxa_sums(is2), TRUE)[1:70])
sub2 = prune_taxa(sub, is2)
#stabilize the variance
sub2_VST = sub2
otu_table(sub2_VST) <- otu_table(sub2_VST) + 1
sub2.ds = phyloseq_to_deseq2(sub2_VST, ~Tooth_Class)
## converting counts to integer mode
#variance stabilizing transformation
sub2_vst <- varianceStabilizingTransformation(sub2.ds , blind=FALSE, fitType="local")
counts_VST <- otu_table(as.matrix(assay(sub2_vst)), taxa_are_rows=TRUE)
otu_table(sub2_VST) <- counts_VST
#change the names
tax = data.frame(tax_table(sub2))
names = tax$Highest.Rank
otus= data.frame(otu_table(sub2))
#generate the corrrelation matrix and assign the taxa names to rows and columns
cormat = round(cor(otus), 1)
dim(cormat)
## [1] 70 70
rownames(cormat) = names
colnames(cormat) = names
#generate pvalues
p.mat <- cor_pmat(otus)
p=ggcorrplot(cormat, hc.order = TRUE, type="full", hc.method = "average", lab_size = 6, tl.cex=6, sig.level=0005, colors = c("turquoise1", "white", "coral")) + theme(text = element_text(size=12), axis.text.x = element_text(angle=90, vjust=1)) +theme(legend.position="bottom")
#test figure
p
ggsave(p, file="~/Dropbox/Figure2b.eps", width = 10, height = 8, units ="in",dpi = 300, device="eps")
keep = c("Corynebacterium durum", "Rothia dentocariosa", "Veillonella dispar str. 01", "Haemophilus parainfluenzae", "Rothia aeria")
#rename taxa names with highest rank
tax = data.frame(tax_table(is2))
temp = is2
taxa_names(temp) = tax$Highest.Rank
sub1 = prune_taxa(keep, temp)
CS1.phys = sub1
#get the data into data.frame
CS1.df <- data.frame(tax_table(CS1.phys), t(otu_table(CS1.phys)),
taxa_names(CS1.phys))
colnames(CS1.df)[395] <- "RSV"
CS1.dfm <- melt(CS1.df, id.var=c("RSV", colnames(tax_table(CS1.phys))))
#make a denovo mapping file to fix what was lost during merge
map1 = read.csv("~/Desktop/Proctor_NatureComm/mytoothdot_coordinates_FullMouth.csv")
colnames(map1)[1] <- "Habitat"
fm = CS1.dfm$variable
fm1 = str_replace_all(fm, "_S", ".S")
fm2 = colsplit(fm1, "[(.)]", c("J1", "J2", "Habitat"))
CS1.dfm$Habitat = fm2$Habitat
CS1.dfm = join(CS1.dfm, map1, by="Habitat")
CS1.dfm$Tooth_Number <- as.factor(CS1.dfm$Tooth_Number)
#re-create the subject variable
foo = CS1.dfm$variable
foo2 = colsplit(foo, "[(_)]", c("Subject", "Site"))
foo2 = str_replace_all(foo2$Subject, "X", "")
CS1.dfm$Subject = foo2
CS1.dfm$value = CS1.dfm$value + 1
f=ggplot(CS1.dfm, aes(as.numeric(Tooth_Number), value, group=Highest.Rank, color=Highest.Rank)) + geom_smooth(se=TRUE) + theme_bw() + facet_wrap(~Tooth_Aspect, ncol=1)+ scale_y_continuous(trans='log2') + xlab("Tooth Number") + ylab("Log2 Abundance") + guides(color=guide_legend(title="Taxon")) + scale_x_continuous(breaks = seq(from=0, to=30, by=5), labels=seq(from=0, to=30, by=5))+theme(legend.position="none")
f
## `geom_smooth()` using method = 'loess'
ggsave(f, file="~/Dropbox/Figure2c.png", width = 3.5, height = 6, units ="in",dpi = 300, device="png")
## `geom_smooth()` using method = 'loess'
#note: i printed the p and f plots and put together the figure in powerpoint
The ordination method presented so far demonstrate that the data conform to a spatial pattern, but none of these models explicitly model the variance as a function of geographic location. Moreover, we also wanted to know whether communities inhabiting the mucosal surfaces (e.g., buccal mucosa, alveolar mucosa, keratinized gingiva) also vary along an ecological gradient.
To examine the variation of OTUs across a spatial gradient, we performed another PCA-IV, but instead of constraining by a factor (e.g., Habitat), we constrained by a third order polynomial function of the geographic coordinates. - Constrain ordination of the PCA Correlation Matrix by the polynomial function - Polynomial function: Y ~ X + X^2 + X^3 + Y + XY + X^2Y + Y^2 + XY^2 + Y^3 - Maximize the variance explained by the polynomial terms - Sometimes called Trend Surface Analysis - We see that the first 4 axes explain a chunk of the variation
#convert the biogeography datatset to trelative abundance
bioh.otus = data.frame(otu_table(Biogeo2_phys15))
bioh.euc = vegdist(bioh.otus, "euclidean")
bioh.nmds = cmdscale(bioh.euc, eig=TRUE, k=10)
bioh.map = data.frame(sample_data(Biogeo2_phys15), scores(bioh.nmds))
h.map$method = "Hellinger"
h.eig = 100*(h.nmds$eig/sum(h.nmds$eig))
evals = h.nmds$eig
head(h.eig)
## [1] 23.478257 10.985589 9.710830 7.991654 6.795043 5.868218
df = data.frame(h.eig[1:10], 1:10)
colnames(df) = c("Percent.Variation", "rank")
p=ggplot(df, aes(rank, Percent.Variation)) + geom_point() + theme_bw()
p
#coordinates
map <- sample_data(Biogeo2_VST)
coo <- cbind(map$x, map$y)
colnames(coo) <- c("x", "y")
#hellinger transform the data
otus = otu_table(Biogeo2_phys15)
otus.h = decostand(otus, "hellinger")
### Perform PCA-IV with respect to a 3rd order polynomial of geographic coordinates
poly.xy3 <- poly(coo, degree = 3, raw=FALSE) #, coefs=TRUE)
colnames(poly.xy3) <- c("X", "X^2", "X^3", "Y", "XY", "X^2Y", "Y^2", "XY^2", "Y^3")
poly.xy3.df <- data.frame(poly.xy3, map$x, map$y)
library(ade4)
#totus <- data.frame(t(otu_table(Biogeo2_VST)))
rld.pca <- dudi.pca(otus.h, center=TRUE, scale=TRUE, scannf=FALSE, nf=10)
rld.xy3 <- pcaiv(rld.pca, poly.xy3, scannf = FALSE, nf = 6)
rld.xy3.df <- data.frame(rld.xy3$ls, map)
xy3.df <- data.frame(rld.xy3.df, poly.xy3)
# how much of the variance does this model explain?
rld.xy3.var <- sum(rld.xy3$eig)/sum(rld.pca$eig)*100
rld.xy3.var
## [1] 10.48273
# get the eigen values
pca.scree <- data.frame(rank = 1:length(rld.xy3$eig))
Explainedvariance = rld.xy3$eig/sum(rld.xy3$eig)*100
Explainedvariance
## [1] 49.350953 20.839256 16.951392 4.386053 2.489342 2.162378 1.558638
## [8] 1.243442 1.018547
#how much variance does the first axis explain?
(Explainedvariance[1]/100)*rld.xy3.var
## [1] 5.173326
#force x, y to numeric
xy3.df$x <- as.numeric(as.character(xy3.df$x))
xy3.df$y <- as.numeric(as.character(xy3.df$y))
### Force variables to order in ggplot
xy3.df$Tooth<- as.numeric(as.character(xy3.df$Tooth))
#order the sites for plotting
order <- c("Alveolar Mucosa ", "Keratinized Gingiva ", "Supragingival Tooth ", "Buccal Mucosa ")
xy3.df$Site.New <- factor(xy3.df$Site.New, as.character(order))
xy3.df$Class = str_replace_all(xy3.df$Class, "CentralIncisor", "Incisor")
xy3.df$Class = str_replace_all(xy3.df$Class, "LateralIncisor", "Incisor")
cbPalette <- c("grey76", "Salmon", "#00BFC4", "grey76")
#plot Axis 1 over teeth
fig3 = ggplot(xy3.df, aes(Tooth, Axis1)) +geom_smooth(fill = "gray90", alpha=1) + geom_point(aes(colour=Class)) + scale_colour_manual(values=cbPalette) +facet_wrap(Aspect~Site.New, ncol=4, scales="free_y") + theme_bw() + ylab("Axis 1") + xlab("Tooth") + ggtitle("")+ theme(plot.title = element_text(size=15)) +scale_x_continuous(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5))
ggsave(fig3, file="~/Dropbox/Figure3.eps", width = 10, height = 8, units ="in",dpi = 300, device="eps")
## `geom_smooth()` using method = 'loess'
print(fig3)
## `geom_smooth()` using method = 'loess'
#third order non-orthogonal polynomial
fullmouth.trendnon.rda <- rda(otus.h ~., data=as.data.frame(poly.xy3))
(R2adj.poly <- RsquareAdj(fullmouth.trendnon.rda)$adj.r.squared)
## [1] 0.07541264
# do the forward selection
library(packfor)
fullmouth.trend.fwd <- forward.sel(otus.h, poly.xy3, adjR2thresh=R2adj.poly)
## Testing variable 1
## Testing variable 2
## Testing variable 3
## Testing variable 4
## Testing variable 5
## Testing variable 6
## Procedure stopped (adjR2thresh criteria) adjR2cum = 0.075776 with 6 variables (superior to 0.075413)
fullmouth.trend.fwd
## variables order R2 R2Cum AdjR2Cum F pval
## 1 Y^2 7 0.019646491 0.01964649 0.01783102 10.821714 0.001
## 2 X^2 2 0.027058934 0.04670543 0.04316815 15.299327 0.001
## 3 Y 4 0.015807555 0.06251298 0.05728536 9.071555 0.001
## 4 Y^3 9 0.008504480 0.07101746 0.06409766 4.916030 0.001
## 5 X^2Y 6 0.010867203 0.08188466 0.07332016 6.344324 0.001
## 6 X 1 0.004141985 0.08602665 0.07577648 2.424537 0.019
#perform a new RDA using the significant terms
fullmouth.trend.rda2 <- rda(otus.h~., data=as.data.frame(poly.xy3)[,fullmouth.trend.fwd[,2]])
#what is the R2 of the new model
(R2adj.fwd <- RsquareAdj(fullmouth.trend.rda2)$adj.r.squared)
## [1] 0.07577648
#test the significance of the model
set.seed(44)
bio.aov = anova.cca(fullmouth.trend.rda2, step=1000)
bio.aov
## Permutation test for rda under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: rda(formula = otus.h ~ `Y^2` + `X^2` + Y + `Y^3` + `X^2Y` + X, data = as.data.frame(poly.xy3)[, fullmouth.trend.fwd[, 2]])
## Df Variance F Pr(>F)
## Model 6 0.03557 8.3927 0.001 ***
## Residual 535 0.37786
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#test the significance of the individual axes
set.seed(99)
axes.aov = anova.cca(fullmouth.trend.rda2, step=1000, by="axis")
axes.aov
## Permutation test for rda under reduced model
## Forward tests for axes
## Permutation: free
## Number of permutations: 999
##
## Model: rda(formula = otus.h ~ `Y^2` + `X^2` + Y + `Y^3` + `X^2Y` + X, data = as.data.frame(poly.xy3)[, fullmouth.trend.fwd[, 2]])
## Df Variance F Pr(>F)
## RDA1 1 0.01569 22.2202 0.001 ***
## RDA2 1 0.00898 12.7122 0.001 ***
## RDA3 1 0.00705 9.9820 0.001 ***
## RDA4 1 0.00184 2.6011 0.102
## RDA5 1 0.00133 1.8866 0.185
## RDA6 1 0.00067 0.9541 0.435
## Residual 535 0.37786
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the 5 significant spatial structures (i.e., canonical axes)
#note that scaling to 1 displays the spatial model (i.e., the linear combo of spatial variables while preserving the Euclidean distance among sites)
fullmouth.trend.fit <- scores(fullmouth.trend.rda2, choices=c(1, 2, 3, 4, 5), display="lc", scaling=1)
fit = data.frame(fullmouth.trend.fit, sample_data(Biogeo2_phys15))
fit$x = as.numeric(as.character(fit$x))
fit$y = as.numeric(as.character(fit$y))
library(RColorBrewer)
myPalette = colorRampPalette(brewer.pal(11, "RdBu"), space="Lab")
#plot the first axis
p1 = ggplot(fit, aes(x, y, color=RDA1)) +ggtitle("RDA1") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + facet_wrap(~Site.New, ncol=4)
#plot the second axis
p2 = ggplot(fit, aes(x, y, color=RDA2)) +ggtitle("RDA2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)
#plot the third axis
p3 = ggplot(fit, aes(x, y, color=RDA3)) +ggtitle("RDA3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)
grid.arrange(p1, p2, p3, ncol=3)
xy3.df$Site.New = str_replace_all(xy3.df$Site.New, "Alveolar Mucosa ", "AM")
xy3.df$Site.New = str_replace_all(xy3.df$Site.New, "Keratinized Gingiva ", "KG")
xy3.df$Site.New = str_replace_all(xy3.df$Site.New, "Supragingival Tooth ", "Tooth")
xy3.df$Site.New = str_replace_all(xy3.df$Site.New, "Buccal Mucosa ", "BM")
a2 = ggplot(xy3.df, aes(Tooth, Axis2)) + theme_bw() + ylab("Axis 2") + facet_wrap(Aspect~Site.New, ncol=4, scales="free_y")+ xlab("Tooth") + xlab("Tooth") +theme(legend.position="none") + theme(plot.title = element_text(size=8)) + geom_smooth() + geom_point() + theme(text = element_text(size=10), axis.text.x = element_text(angle=00, vjust=1))
a3 = ggplot(xy3.df, aes(Tooth, Axis3)) + theme_bw() + ylab("Axis 3") + facet_wrap(Aspect~Site.New, ncol=4, scales="free_y")+ xlab("Tooth") + xlab("Tooth") +theme(legend.position="none") + theme(plot.title = element_text(size=8)) + geom_smooth() + geom_point()+ theme(text = element_text(size=10), axis.text.x = element_text(angle=00, vjust=1))
grid.arrange(a2, a3, ncol=2)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
#ggsave(grid.arrange(a2, a3, ncol=1), file="~/Dropbox/Figures_20170617/Revised/FinalFigures/FigureS11a.png", width = 6, height = 9, units ="in",dpi = 300, device="png")
sig = fullmouth.trend.fwd$variables
sig = str_replace_all(sig, "[(^)]", ".")
xy.sig = data.frame(poly.xy3)[sig]
#subset on the significant polynomial terms
df = as.data.frame(poly.xy3)[,fullmouth.trend.fwd[,2]]
df = data.frame(df, sample_data(Biogeo2_phys15))
df$x = as.numeric(as.character(df$x))
df$y = as.numeric(as.character(df$y))
dfm = melt(df, id.vars = colnames(sample_data(Biogeo1_phys)))
#plot the first axis
p1 = ggplot(df, aes(x, y, color=X.2)) +ggtitle("X^2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")
p2 = ggplot(df, aes(x, y, color=Y.2)) +ggtitle("Y^2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")
p3 = ggplot(df, aes(x, y, color=Y)) +ggtitle("Y") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")
p4 = ggplot(df, aes(x, y, color=Y.3)) +ggtitle("Y^3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")
p5 = ggplot(df, aes(x, y, color=X.2Y)) +ggtitle("X^2*Y") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")
p6 = ggplot(df, aes(x, y, color=X)) +ggtitle("X") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")
grid.arrange(p1, p2, p3,p4, p5, p6, ncol=2)
#ggsave(grid.arrange(p1, p2, p3,p4, p5, p6, ncol=2),file="~/Dropbox/Figures_20170617/Revised/FinalFigures/FigureS11b.png", width = 10, height = 9, units ="in",dpi = 300, device="png")
First, let’s take a look at the differences between the salivary flow rates of the control and SS populations. We’ll look at the Unstimulated Whole Saliva Flow Rate (UWS-FR) as well as the stimulated whole salivary flow rate (SWS-FR).
#read in the pids
data1 = read.csv("~/Desktop/Proctor_NatureComm/pids.csv")
redcap.id = levels(as.factor(data1$redcap_record_num))
#read in the flow rates
data2 = read.csv("/Users/dmap/Desktop/Proctor_NatureComm/flow_rates_20170710_v2.0.csv")
#merge the pids with the flow rates
data3 = join(data1, data2, "redcap_record_num")
#add aim variable
sa1 = c(17 , 18, 36 , 39 , 49 , 63 , 170)
data3$aim = ifelse(data3$redcap_record_num %in% sa1, "Control", "Sjogrens")
data3$dcoll_sws_flowrate = as.numeric(as.character(data3$dcoll_sws_flowrate))
data3$dcoll_uws_flowrate = as.numeric(as.character(data3$dcoll_uws_flowrate))
#look at it
head(data3)
## redcap_record_num pid X redcap_event_name dcoll_uws_flowrate
## 1 17 1-101 8 d00_screening_exam_arm_1 NA
## 2 17 1-101 23 d01_arm_1 0.4270000
## 3 17 1-101 35 d08_arm_1 0.6791350
## 4 17 1-101 42 d15_arm_1 0.5094678
## 5 17 1-101 48 d22_arm_1 0.6074000
## 6 17 1-101 53 d29_arm_1 0.3364000
## dcoll_sws_flowrate escr_uws_fr Day Notes aim
## 1 NA 0.611 0 None Control
## 2 7.207034 <NA> 1 None Control
## 3 6.735644 <NA> 8 None Control
## 4 7.284400 <NA> 15 None Control
## 5 7.228015 <NA> 22 None Control
## 6 4.458800 <NA> 29 None Control
#how correlated is UWS-FR with SWS-FR?
p2 =ggplot(data3, aes(dcoll_uws_flowrate, dcoll_sws_flowrate, color=aim)) + geom_point() + theme_bw() + ggtitle("Correlation UWS-FR: SWS-FR")
p2
cor.test(data3$dcoll_uws_flowrate, data3$dcoll_sws_flowrate, method="pearson", exact=TRUE,na.rm=TRUE)
##
## Pearson's product-moment correlation
##
## data: data3$dcoll_uws_flowrate and data3$dcoll_sws_flowrate
## t = 4.5063, df = 35, p-value = 7.057e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3507946 0.7773392
## sample estimates:
## cor
## 0.6059399
#now let's look at some boxplot
dfm = melt(data3, id.var=c("redcap_record_num", "pid", "redcap_event_name", "Notes", "Day", "aim"))
dfm = subset(dfm, value !="NA")
dfm = subset(dfm, value !="ND")
dfm$value = as.numeric(as.character(dfm$value))
head(dfm)
## redcap_record_num pid redcap_event_name Notes Day aim
## 1 17 1-101 d00_screening_exam_arm_1 None 0 Control
## 2 17 1-101 d01_arm_1 None 1 Control
## 3 17 1-101 d08_arm_1 None 8 Control
## 4 17 1-101 d15_arm_1 None 15 Control
## 5 17 1-101 d22_arm_1 None 22 Control
## 6 17 1-101 d29_arm_1 None 29 Control
## variable value
## 1 X 8
## 2 X 23
## 3 X 35
## 4 X 42
## 5 X 48
## 6 X 53
#get the mean flow rates
library(doBy)
#make an uws-fr df
uws = subset(dfm, variable %in% c("dcoll_uws_flowrate", "escr_uws_fr"))
#make a sws-fr df
sws = subset(dfm, variable == "dcoll_sws_flowrate")
#get significance of between group means - UWS
a1_uws = subset(uws, aim=="Control")
a3_uws = subset(uws, aim=="Sjogrens")
t.test(a1_uws$value,a3_uws$value)
##
## Welch Two Sample t-test
##
## data: a1_uws$value and a3_uws$value
## t = 8.3234, df = 36.912, p-value = 5.403e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.3121027 0.5129693
## sample estimates:
## mean of x mean of y
## 0.5201955 0.1076595
#get significance of between group means - UWS
a1_sws = subset(sws, aim=="Control")
a3_sws = subset(sws, aim=="Sjogrens")
t.test(a1_sws$value,a3_sws$value)
##
## Welch Two Sample t-test
##
## data: a1_sws$value and a3_sws$value
## t = 6.3965, df = 15.333, p-value = 1.083e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.783452 5.557657
## sample estimates:
## mean of x mean of y
## 6.131839 1.961285
#generate the average subject value for the flow rates in aim 1 since we have repeated measures and don't want those to be treated as if they were independent in the t.test
uws_df = summaryBy(value~pid+redcap_record_num, data=uws, FUN=mean, na.rm=TRUE)
sws_df = summaryBy(value~pid+redcap_record_num, data=sws, FUN=mean, na.rm=TRUE)
#merge the mean uws-fr, sws-fr, pid and microbial data
frs = join (uws_df, sws_df, by=c("pid", "redcap_record_num"))
colnames(frs)[1] = c("Subject")
colnames(frs)[3:4] = c("UWS_FR", "SWS_FR")
### Merge the flow rate data with the microbial data
flow <- subset_samples(supra, Day=="1" & Protocol=="Clinic")
flow # 480 taxa and 923 samples
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 480 taxa and 923 samples ]
## sample_data() Sample Data: [ 923 samples by 30 sample variables ]
## tax_table() Taxonomy Table: [ 480 taxa by 10 taxonomic ranks ]
flow.map <- data.frame(sample_data(flow))
#merge the sequencing data with the clinical data
flow.map2 = join(flow.map, frs, "Subject")
#read in the caries data and then subset on the redcap ids that are included in the dataset
data = read.csv("~/Desktop/Proctor_NatureComm/data_caries_combined_v1.2_20170328.csv")
colnames(data)[3] = "redcap_record_num"
data1 = subset(data, redcap_record_num %in% redcap.id)
#subset on buccal and lingual (excluding the occlusal surfaces as well as the mesial- and distal- sites) and get rid of wisdom teeth
bl = subset(data1, surface %in% c("B", "L"))
nowise = c(2:15, 18:31)
bl = subset(bl, tooth %in% nowise)
#convert disease to 1, sound to 0
bl$binary = ifelse(bl$description =="SOUND", 0, 1)
bl$binary = as.factor(as.numeric(bl$binary))
#count the number of decayed missing teeth per subject
bl.sum = summaryBy(binary~redcap_record_num, data=bl, FUN=sum)
bl.sum$prop = bl.sum/28
colnames(bl)[3]= "redcap_record_num"
colnames(bl)[8]= "Tooth_Number"
colnames(bl)[9]= "Tooth_Aspect"
bl$Tooth_Aspect = str_replace_all(bl$Tooth_Aspect, "([B,L])", replacement =c("Buccal", "Lingual"))
bl = bl[,3:11]
#join the caries data with the microbial data
total3 = join(flow.map2, bl, b=c("redcap_record_num", "Tooth_Number", "Tooth_Aspect"))
total3$x <- as.numeric(as.character(total3$x))
total3$y <- as.numeric(as.character(total3$y))
rownames(total3) = total3[,1]
total3 = sample_data(total3)
# relabel the subjects so they will correspond to the way they are labeled in figure 5
#relabel controls that have symetrical gradients
total3$Subject = str_replace_all(total3$Subject, "1-101", "Control 01")
total3$Subject = str_replace_all(total3$Subject, "1-103", "Control 02")
total3$Subject = str_replace_all(total3$Subject, "1-104", "Control 03")
total3$Subject = str_replace_all(total3$Subject, "1-106", "Control 04")
total3$Subject = str_replace_all(total3$Subject, "1-107", "Control 05")
#relabel the controls that have flat maxillary lines
total3$Subject = str_replace_all(total3$Subject, "1-102", "Control 06")
total3$Subject = str_replace_all(total3$Subject, "1-105", "Control 07")
#relabels sjogrens that have symetrical curves
total3$Subject = str_replace_all(total3$Subject, "3-305", "Sjogrens 01")
total3$Subject = str_replace_all(total3$Subject, "3-302", "Sjogrens 02")
total3$Subject = str_replace_all(total3$Subject, "3-306", "Sjogrens 03")
#relabel sjogren's that have flat lines
total3$Subject = str_replace_all(total3$Subject, "3-304", "Sjogrens 04")
total3$Subject = str_replace_all(total3$Subject, "3-308", "Sjogrens 05")
total3$Subject = str_replace_all(total3$Subject, "3-310", "Sjogrens 06")
total3$Subject = str_replace_all(total3$Subject, "3-309", "Sjogrens 07")
total3$Subject = str_replace_all(total3$Subject, "3-303", "Sjogrens 08")
total3$Subject = str_replace_all(total3$Subject, "3-307", "Sjogrens 09")
#relabel sjogrens that have exaggerated lines
total3$Subject = str_replace_all(total3$Subject, "3-301", "Sjogrens 10")
#make the total3 variable the new mapping file for the object flow
sample_data(flow) = total3
#how many samples and subjects are there?
flow
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 480 taxa and 923 samples ]
## sample_data() Sample Data: [ 923 samples by 39 sample variables ]
## tax_table() Taxonomy Table: [ 480 taxa by 10 taxonomic ranks ]
levels(sample_data(flow)$Subject) #7 controls; 10 aim 3
## NULL
#get ridof subjects with missing flow rate for the cca
flows = subset_samples(flow, UWS_FR & SWS_FR != "NA")
levels(sample_data(flows)$Subject) #7 controls; lost subject 3-310 because of sws-fr
## NULL
#subset on taxa present in 10% of samples
filtergroup = filterfun(kOverA(k=75, A=1)) #k = number of samples; A = abundance
flows = filter_taxa(flows, filtergroup, prune=TRUE)
flows = prune_taxa(taxa_sums(flows) > 0, flows)
flows = prune_samples(sample_sums(flows) > 0, flows)
flows
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 147 taxa and 825 samples ]
## sample_data() Sample Data: [ 825 samples by 39 sample variables ]
## tax_table() Taxonomy Table: [ 147 taxa by 10 taxonomic ranks ]
#stabilize the variance
flows_VST = flows
otu_table(flows_VST) <- otu_table(flows_VST) + 1
flows.ds = phyloseq_to_deseq2(flows_VST, ~PoolName)
## converting counts to integer mode
#variance stabilizing transformation
flows_vst <- varianceStabilizingTransformation(flows.ds , blind=FALSE, fitType="local")
counts_VST <- otu_table(as.matrix(assay(flows_vst)), taxa_are_rows=TRUE)
otu_table(flows_VST) <- counts_VST
#set an analysis up for CCA
otus = t(otu_table(flows_VST))
map1 = data.frame(sample_data(flows_VST)$Tooth_Class, sample_data(flows_VST)$UWS_FR, sample_data(flows_VST)$SWS_FR, sample_data(flows_VST)$binary)
colnames(map1) = c("Class", "uws_fr", "sws_fr", "dmfs")
map1$Class = as.numeric(map1$Class)
subs = as.vector(sample_data(flows_VST)$Subject)
runs = as.vector(sample_data(flows_VST)$Pool_Name)
#for some reason this wworks on the command line but not in knitr
attach(map1)
flow.cca = vegan::cca(otus ~ map1$Class*map1$uws_fr*map1$sws_fr*map1$dmfs, sitenv=map1)
#run an anova on this
set.seed(100)
flow.aov = anova.cca(flow.cca, by="term", strata=subs)
flow.aov
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus ~ map1$Class * map1$uws_fr * map1$sws_fr * map1$dmfs, sitenv = map1)
## Df ChiSquare F Pr(>F)
## map1$Class 1 0.001240 4.2099 0.001
## map1$uws_fr 1 0.030191 102.5027 0.001
## map1$sws_fr 1 0.008321 28.2490 0.001
## map1$dmfs 1 0.002725 9.2517 0.271
## map1$Class:map1$uws_fr 1 0.000473 1.6048 0.001
## map1$Class:map1$sws_fr 1 0.000418 1.4183 0.004
## map1$uws_fr:map1$sws_fr 1 0.023267 78.9949 0.001
## map1$Class:map1$dmfs 1 0.000458 1.5557 0.247
## map1$uws_fr:map1$dmfs 1 0.001513 5.1381 0.190
## map1$sws_fr:map1$dmfs 1 0.001976 6.7094 0.106
## map1$Class:map1$uws_fr:map1$sws_fr 1 0.000377 1.2793 0.007
## map1$Class:map1$uws_fr:map1$dmfs 1 0.000342 1.1603 0.164
## map1$Class:map1$sws_fr:map1$dmfs 1 0.000551 1.8719 0.187
## map1$uws_fr:map1$sws_fr:map1$dmfs 1 0.001440 4.8885 0.807
## map1$Class:map1$uws_fr:map1$sws_fr:map1$dmfs 1 0.000424 1.4379 0.212
## Residual 809 0.238285
##
## map1$Class ***
## map1$uws_fr ***
## map1$sws_fr ***
## map1$dmfs
## map1$Class:map1$uws_fr ***
## map1$Class:map1$sws_fr **
## map1$uws_fr:map1$sws_fr ***
## map1$Class:map1$dmfs
## map1$uws_fr:map1$dmfs
## map1$sws_fr:map1$dmfs
## map1$Class:map1$uws_fr:map1$sws_fr **
## map1$Class:map1$uws_fr:map1$dmfs
## map1$Class:map1$sws_fr:map1$dmfs
## map1$uws_fr:map1$sws_fr:map1$dmfs
## map1$Class:map1$uws_fr:map1$sws_fr:map1$dmfs
## Residual
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#how much variance does the model explain
eig = flow.cca$CCA$eig
eigp = 100*(eig/sum(eig))
head(eigp)
## CCA1 CCA2 CCA3 CCA4 CCA5 CCA6
## 59.737865 15.292899 11.534283 3.606900 2.646815 2.153165
#how much does the first axis explain?
eigp[1]
## CCA1
## 59.73787
#hwat about the second?
eigp[2]
## CCA2
## 15.2929
#get the sample site scores and plot them
sites = data.frame(flow.cca$CCA$wa, sample_data(flows_VST))
sites$Aim = ifelse(sites$Aim=="SA1", "Control", "Sjogrens")
#convert the MFS score to factor
sites$binary = as.factor(as.numeric(sites$binary))
p1 = ggplot(sites, aes(CCA1, CCA2, color=as.factor(Aim))) + geom_point() + theme_bw() + guides(color=guide_legend(title="Health Status")) + ggtitle("a)") + xlab("CCA1 (59.74%)") + ylab("CCA2 (15.29%)") + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
p2 = ggplot(sites, aes(CCA1, CCA2, color=uws_fr)) + theme_bw() + guides(color=guide_legend(title="UWS-FR")) + ggtitle("b)") + scale_colour_gradientn(colours=c("red", "blue")) + labs(title = "b)" ) + xlab("CCA1 (59.74%)") + ylab("CCA2 (15.29%)")+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) + geom_point()
p3 = ggplot(sites, aes(CCA1, CCA2, color=sws_fr)) + theme_bw() + guides(color=guide_legend(title="SWS-FR")) + ggtitle("c)") + scale_colour_gradientn(colours=c("red", "blue")) + labs(title = "c)" )+ xlab("CCA1 (59.74%)") + ylab("CCA2 (15.29%)")+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))+ geom_point()
p4 = ggplot(sites, aes(CCA1, CCA2, color=as.factor(binary))) + theme_bw() + guides(color=guide_legend(title="MFS")) + ggtitle("d)") + labs(title = "d)" )+ xlab("CCA1 (59.74%)") + ylab("CCA2 (15.29%)")+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))+ geom_point()+ scale_colour_manual(values = c("red", "blue"))
p5 = ggplot(sites, aes(uws_fr, CCA1, color=Aim)) + geom_jitter() +theme_bw() + ggtitle("e)") + ylab("CCA1 (59.74%)") +xlab("UWS-FR")+ guides(color=guide_legend(title="Health Status"))+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
p6 = ggplot(sites, aes(sws_fr, CCA1, color=Aim)) + geom_jitter() +theme_bw() + ggtitle("f)") +xlab("SWS-FR") + ylab("CCA1 (59.74%)") + guides(color=guide_legend(title="Health Status"))+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
fig5= arrangeGrob(p1, p2, p3, p4, p5, p6, ncol=3)
ggsave(grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3), file="~/Dropbox/Figure4.eps", width = 11, height = 8.5, units ="in",dpi = 300)
grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3)
ss = subset(sites, Aim=="Sjogrens")
### how many ss samples map to negative axis 1 scores
neg = subset(ss, CCA1 < 0)
ss.neg= subset(neg, Aim=="Sjogrens")
nrow(ss.neg)/nrow(ss)
## [1] 0.3265306
### how many ss samples map to positive axis 1 scores
pos = subset(ss, CCA1 > 0)
ss.pos= subset(pos, Aim="Sjogrens")
nrow(ss.pos)/nrow(ss)
## [1] 0.6734694
########## look at the distribution of control samples along axis 1
cont = subset(sites, Aim=="Control")
### how many ss samples map to negative axis 1 scores
neg = subset(cont, CCA1 < 0)
nrow(neg)/nrow(cont)
## [1] 0.8880208
### how many ss samples map to positive axis 1 scores
pos = subset(cont, CCA1 > 0)
nrow(pos)/nrow(cont)
## [1] 0.1119792
#how many samples and subjects are there?
flow
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 480 taxa and 923 samples ]
## sample_data() Sample Data: [ 923 samples by 39 sample variables ]
## tax_table() Taxonomy Table: [ 480 taxa by 10 taxonomic ranks ]
levels(sample_data(flow)$Subject) #7 controls; 10 aim 3
## NULL
#stabilize the variance
flows_VST = flow
otu_table(flows_VST) <- otu_table(flows_VST) + 1
flows.ds = phyloseq_to_deseq2(flows_VST, ~PoolName)
## converting counts to integer mode
#variance stabilizing transformation
flows_vst <- varianceStabilizingTransformation(flows.ds , blind=FALSE, fitType="local")
counts_VST <- otu_table(as.matrix(assay(flows_vst)), taxa_are_rows=TRUE)
otu_table(flows_VST) <- counts_VST
#transform sample counts for the PCA-IV
library(ade4)
otus = data.frame(otu_table(flow))
otus.h = decostand(otus, "hellinger")
map <- sample_data(flow)
coo <- cbind(map$x, map$y)
colnames(coo) <- c("x", "y")
### Perform PCA-IV with respect to a 3rd order polynomial of geographic coordinates
poly.xy3 <- poly(coo, degree = 3, raw=FALSE) #, coefs=TRUE)
colnames(poly.xy3) <- c("X", "X^2", "X^3", "Y", "XY", "X^2Y", "Y^2", "XY^2", "Y^3")
poly.xy3.df <- data.frame(poly.xy3, map$x, map$y)
library(ade4)
#totus <- t(otu_table(flows_VST))
rld.pca <- dudi.pca(otus.h, center=TRUE, scale=TRUE, scannf=FALSE, nf=5)
rld.xy3 <- pcaiv(rld.pca, poly.xy3, scannf = FALSE, nf = 2)
rld.xy3.df <- data.frame(rld.xy3$ls, map)
xy3.df <- data.frame(rld.xy3.df, poly.xy3)
# how much of the variance does this model explain?
rld.xy3.var <- sum(rld.xy3$eig)/sum(rld.pca$eig)*100
rld.xy3.var
## [1] 1.55107
# get the eigen values
pca.scree <- data.frame(rank = 1:length(rld.xy3$eig))
Explainedvariance = rld.xy3$eig/sum(rld.xy3$eig)*100
Explainedvariance
## [1] 29.613705 19.335762 13.964370 8.497993 7.825626 6.704618 5.511524
## [8] 4.730248 3.816154
#organize the data for plotting
xy3.df$x <- as.numeric(as.character(xy3.df$x))
xy3.df$y <- as.numeric(as.character(xy3.df$y))
xy3.df$Tooth_Number <- as.numeric(as.character(xy3.df$Tooth_Number))
#force the tooth to order
ordering = 1:32
xy3.df$interval <- factor(xy3.df$Tooth_Number, as.character(ordering))
xy3.df$Jaw = ifelse(xy3.df$Jaw_Quadrant %in% c(1, 2), "Maxilla", "Mandible")
xy3.df$Aim = ifelse(xy3.df$Aim=="SA1", "Control", "Sjogrens")
upper <- 1:2
lower <- 3:4
upper.df <- subset(xy3.df, Jaw_Quadrant %in% upper)
lower.df <- subset(xy3.df, Jaw_Quadrant %in% lower)
#replace subject labels for plotting
xy3.df$Subject = as.factor(xy3.df$Subject)
colnames(xy3.df)[41] = "MFS"
xy3.df$MFS = as.factor(as.numeric(xy3.df$MFS))
xy3.df$Tooth_Class = str_replace_all(xy3.df$Tooth_Class, "Molar_Pre", "Pre-molar")
xy3.df$Tooth_Class = str_replace_all(xy3.df$Tooth_Class, "Incisor_Central", "Central Incisor")
xy3.df$Tooth_Class = str_replace_all(xy3.df$Tooth_Class, "Incisor_Lateral", "Lateral Incisor")
#generate figure
fig5 = ggplot(xy3.df, aes(Tooth_Number, Axis1, color=UWS_FR)) + geom_smooth(fill = "gray90", alpha=1) + facet_wrap(~Subject, ncol=6, scales="free") + theme_bw() + ylab("Axis 1") + guides(fill=guide_legend(title="Health Status")) + xlab("Tooth Number") + geom_point(aes(shape=MFS), size=2) + ggtitle("")+ scale_colour_gradientn(colours=c("red", "blue"))
fig5
## `geom_smooth()` using method = 'loess'
ggsave(fig5, file="~/Dropbox/Figure5.eps", width = 11, height = 8.5, units ="in",dpi = 300, device="eps")
## `geom_smooth()` using method = 'loess'
#save.image(file="Proctor_MainFigures_v28.0.Rdata")
library("ggpubr")
## Loading required package: magrittr
df = fig5$data
df= subset(df, Tooth_Class %in% c("Central Incisor", "Molar"))
buccal = subset(df, Tooth_Aspect=="Buccal")
p=ggplot(df, aes(Tooth_Class, Axis1, color=Tooth_Class)) + geom_boxplot() + facet_wrap(~Subject) + theme_bw()+ theme(text = element_text(size=12), axis.text.x = element_text(angle=90, vjust=1))
p + stat_compare_means(method = "wilcox.test")